/*  
    Context: Jacome et al survey data is cross-sectional. ANES data is sometimes cross-sectional
             and sometimes a panel. 1956-1960, for example, include both cross-sectional and 
             panel respondents. 
    Purpose: Tag which panel respondents in 1958 and 1960 should be kept in the ANES sample. 
    Creates: keepPRs_1958.dta
             keepPRs_1960.dta
*/

clear
set more off

cd "$Mydirectory1/1_DataSources/ANES/"

********************************************************************************
* CLEAN RELEVANT VARIABLES IN 1956-1960 SECTIONS
********************************************************************************

**************
*** 1956
**************

* Load the raw 1956 study (download from ANES website) and stamp the study year
use "./RawData/NES1956.dta", clear
gen year = 1956

* Only three raw variables are needed: respondent id, father's occupation
* and family income
keep year V560002 V560298 V560190
rename (V560298 V560002 V560190) (fatherocc id fam_inc_raw)

* Diagnostic only: flag the first row of every father-occupation code and
* count the flags (= number of distinct codes, missing included)
bysort fatherocc: gen nvals = (_n == 1)
count if nvals

/*
    Unique id: per ANES documentation
    (https://electionstudies.org/wp-content/uploads/2018/12/anes_timeseries_cdf_codebook_var.pdf),
    a unique id is created by combining year + id variable.
    The digits of year and id are glued together as strings and converted
    back to a number.
*/
tostring year id, gen(stryear strid_temp)
gen id_temp = stryear + strid_temp
destring id_temp, gen(id_anes)

* Income codes of 97 and above stand for dk, refused, na
replace fam_inc_raw = . if fam_inc_raw >= 97

* Crosswalk father occupation to coarsened ANES occupations.
* keep(master match) discards crosswalk rows that match no respondent and
* nogenerate suppresses the _merge indicator.
merge m:1 fatherocc using "../Crosswalks/Crosswalk_1956_ANES.dta", keep(master match) nogenerate

* Report (not enforce) uniqueness of the constructed id
duplicates report id_anes

* Tidy up: remove string scratch variables and give the raw id its
* year-specific name
drop stryear *id_temp
rename id TS56ID
sort id_anes
order id_anes year TS56ID

tempfile ANES56
save `ANES56'

**************
*** 1958 
**************

* Load the raw 1958 study (download from ANES website) and stamp the study year
use "./RawData/NES1958.dta", clear
gen year = 1958

keep year V580002 V580008 V580257 V580174 V580203
rename (V580257 V580002 V580203 V580174) (fatherocc id fam_inc_raw ageR)

* Diagnostic only: flag the first row of every father-occupation code and
* count the flags (= number of distinct codes, missing included)
bysort fatherocc: gen nvals = (_n == 1)
count if nvals

/* IMPORTANT STEP: tag1958 marks respondents who do NOT meet the inclusion
                   restrictions for 1958.

                   A respondent is included in 1958 when either
                   (1) R is a cross section respondent (V580008 equal to 1), or
                   (2) R was 28 or 29 years old in the 1956 study (would have
                       been dropped from the sample in 1956 bc outside the
                       30-50 age range).
*/

* Age at the time of the 1956 study, two years before this interview
gen ageR1956 = ageR - 2
tab ageR1956, m

* A missing age is not 28 or 29, so it fails criterion (2)
gen tag1958 = V580008 != 1 & !inlist(ageR1956, 28, 29)

/*
    Unique id: per ANES documentation
    (https://electionstudies.org/wp-content/uploads/2018/12/anes_timeseries_cdf_codebook_var.pdf),
    this is done by combining year + id variable.
*/
tostring year id, gen(stryear strid_temp)
gen id_temp = stryear + strid_temp
destring id_temp, gen(id_anes)

* Crosswalk father occupation to coarsened ANES occupations.
* keep(master match) discards crosswalk rows that match no respondent and
* nogenerate suppresses the _merge indicator.
merge m:1 fatherocc using "../Crosswalks/Crosswalk_1958_ANES.dta", keep(master match) nogenerate

* Income codes of 95 and above stand for dk, refused, na
replace fam_inc_raw = . if fam_inc_raw >= 95

* Report (not enforce) uniqueness of the constructed id
duplicates report id_anes

drop stryear *id_temp ageR1956
rename id TS58ID
sort id_anes
order id_anes year TS58ID

tempfile ANES58
save `ANES58'

**************
*** 1960 
**************

* Load the raw 1960 study (download from ANES website) and stamp the study year
use "./RawData/NES1960.dta", clear
gen year = 1960

keep year V600002 V600015 V600180 V600122 V600189
rename (V600180 V600002 V600122 V600189) (fatherocc id ageR fam_inc_raw)

* Diagnostic only: flag the first row of every father-occupation code and
* count the flags (= number of distinct codes, missing included)
bysort fatherocc: gen nvals = (_n == 1)
count if nvals

/* IMPORTANT STEP: tag1960 marks respondents who do NOT meet the inclusion
                   restrictions for 1960.

                   A respondent is included in 1960 when any of these holds:
                   1) R is a cross section respondent in 1960, or
                   2) R was 26 or 27 years old in the 1956 study
                      (and would have been dropped from our 1956 and 1958
                      samples bc outside 30-50 age range), or
                   3) R was 28 or 29 years old in 1958 and only appears
                      in the panel in 1958 + 1960 (and would have been dropped
                      from our 1958 sample bc outside 30-50 age range), or
                   4) R only appears in the panel in 1956 and 1960 and was 28
                      or 29 in 1956. For these Rs the 1960 study was the first
                      time that they entered our sample of Rs aged 30-50.
*/

* Ages at the time of the two earlier studies
gen ageR1956 = ageR - 4
tab ageR1956, m

gen ageR1958 = ageR - 2
tab ageR1958, m

* a == 1 when at least one inclusion criterion is met; the four terms follow
* the order 1), 3), 2), 4) of the list above
gen a = V600015 == 9 ///
    | (inlist(ageR1958, 28, 29) & V600015 == 6) ///
    | (inlist(ageR1956, 26, 27) & inrange(V600015, 1, 5)) ///
    | (inlist(ageR1956, 28, 29) & inlist(V600015, 2, 5))

gen tag1960 = !a

/*
    Unique id: per ANES documentation
    (https://electionstudies.org/wp-content/uploads/2018/12/anes_timeseries_cdf_codebook_var.pdf),
    this is done by combining year + id variable.
*/
tostring year id, gen(stryear strid_temp)
gen id_temp = stryear + strid_temp
destring id_temp, gen(id_anes)

* Crosswalk father occupation to coarsened ANES occupations.
* keep(master match) discards crosswalk rows that match no respondent and
* nogenerate suppresses the _merge indicator.
merge m:1 fatherocc using "../Crosswalks/Crosswalk_1960_ANES.dta", keep(master match) nogenerate

* Income codes of 95 and above stand for dk, refused, na
replace fam_inc_raw = . if fam_inc_raw >= 95

* Report (not enforce) uniqueness of the constructed id
duplicates report id_anes

drop stryear *id_temp ageR1956 ageR1958
rename id TS60ID
sort id_anes
order id_anes year TS60ID

tempfile ANES60
save `ANES60'

********************************************************************************
********************************************************************************

* Stack the cleaned 1958 and 1956 extracts underneath the 1960 data that is
* still in memory
append using `ANES58' `ANES56'

* Label new variables
label var year "Year"
label var fatheroccej "Father's occ while R grew up, crosswalk"
label var id_anes "R ID (unique identifier)"

* Scratch variables from the cleaning sections are no longer needed
drop a nvals

/* IMPORTANT: tagmiss == 1 marks respondents who will not be
              part of any eventual regression analysis
              sample because they lack at least one of two pieces
              of info: father occupation and adult child
              family income.
*/
gen tagmiss = (fatheroccej == . | fam_inc_raw == .)
tab tagmiss, m

* The same variable list drives both the selection and the column order
local keepvars id_anes year TS56ID TS58ID TS60ID fatheroccej fam_inc_raw tagmiss tag1958 tag1960 V580008 V600015 ageR
keep `keepvars'
order `keepvars'

sort id_anes

*****************************************************************
* SAVE A TEMPFILE FOR EACH SURVEY YEAR
*****************************************************************

    // One pass per study year: isolate that year's rows, sort them by the
    // year-specific id (TS56ID / TS58ID / TS60ID) and store them in a
    // tempfile named missing1956 / missing1958 / missing1960. The stacked
    // data in memory is restored after every pass.
    foreach yr in 1956 1958 1960 {
        local yy = substr("`yr'", 3, 2)   // two-digit year used in the id name

        preserve
        keep if year == `yr'
        sort TS`yy'ID
        count

        tempfile missing`yr'
        save `missing`yr''
        restore
    }

**************************************************************************************************
* BY YEAR: SAVE ONLY PANEL RESPONDENTS IN ANES ID FILE & MERGE ON CLEANED TEMPFILE
**************************************************************************************************
/* NOTE: See https://electionstudies.org/1956-1960-panel-study-id-file/ 
         for explanation of how to read ANES id file. */ 

***********
* 1956
***********
clear
set more off

cd "$Mydirectory1/"

    * Load the ANES 1956-1960 panel ID file (one row per respondent, with
    * that respondent's ids and wave indicators for each study)
    import excel using ./1_DataSources/Crosswalks/anes_panel_1956to1960_idfile.xls, cellrange(B1:O2548) firstrow clear

    //Drop cross-section only Rs
    //1960
    drop if case1960TS==1 & (PRE60wave==1 | POST60wave==1) & PRE56wave==0 & POST56wave==0 & POST58wave==0 & case1956TS==0 & case1958TS==0
    //1958
    drop if case1958TS==1 & POST58wave==1 & PRE56wave==0 & POST56wave==0 & PRE60wave==0 & POST60wave==0 & case1956TS==0 & case1960TS==0
    //1956
    drop if case1956TS==1 & (PRE56wave==1 | POST56wave==1) & POST58wave==0 & PRE60wave==0 & POST60wave==0 & case1958TS==0 & case1960TS==0 
    
    sort TS56ID
    
    * Attach the cleaned 1956 survey variables to the panel respondents
    merge 1:1  TS56ID using `missing1956'
    
    /* Keep matched rows only:
         _merge==1  ID-file rows with no 1956 interview in the cleaned data
         _merge==2  1956 respondents without a row in the (panel-only) ID file
       Dropping only _merge==1 would leave the _merge==2 rows in a dataset
       that is meant to hold panel respondents exclusively. */
    keep if _merge==3
    drop _merge
    
    tempfile temp56
    save `temp56'
    
*----------------------------*
*----------------------------*

***********
* 1958
***********

clear 
set more off

cd "$Mydirectory1/"

    * Load the ANES 1956-1960 panel ID file (one row per respondent, with
    * that respondent's ids and wave indicators for each study)
    import excel using ./1_DataSources/Crosswalks/anes_panel_1956to1960_idfile.xls, cellrange(B1:O2548) firstrow clear

    //Drop cross-section only Rs
    //1960
    drop if case1960TS==1 & (PRE60wave==1 | POST60wave==1) & PRE56wave==0 & POST56wave==0 & POST58wave==0 & case1956TS==0 & case1958TS==0
    //1958
    drop if case1958TS==1 & POST58wave==1 & PRE56wave==0 & POST56wave==0 & PRE60wave==0 & POST60wave==0 & case1956TS==0 & case1960TS==0
    //1956
    drop if case1956TS==1 & (PRE56wave==1 | POST56wave==1) & POST58wave==0 & PRE60wave==0 & POST60wave==0 & case1958TS==0 & case1960TS==0 
    
    /*Note: Multiple observations are given "0" for TS58ID when 
            a panel respondent is present in 1956 + 1960 but not 
            in 1958. Change these to unique negative values (minus the 
            row number) so that the 1:1 merge key is unique. */
    replace TS58ID = -1*_n if TS58ID==0 
    duplicates report TS58ID //no duplicates
    
    sort TS58ID
    
    * Attach the cleaned 1958 survey variables to the panel respondents
    merge 1:1  TS58ID using `missing1958'
    
    /* Keep matched rows only:
         _merge==1  ID-file rows with no 1958 interview in the cleaned data
         _merge==2  1958 respondents without a row in the (panel-only) ID file
       The _merge==2 rows carry a missing TS56ID; if they were kept, every
       later "by TS56ID" step would pool them into one artificial group. */
    keep if _merge==3
    drop _merge
    
    tempfile temp58
    save `temp58'

*----------------------------*
*----------------------------*

***********
* 1960
***********

clear 
set more off

cd "$Mydirectory1/"

    * Load the ANES 1956-1960 panel ID file (one row per respondent, with
    * that respondent's ids and wave indicators for each study)
    import excel using ./1_DataSources/Crosswalks/anes_panel_1956to1960_idfile.xls, cellrange(B1:O2548) firstrow clear

    //Drop cross-section only Rs
    //1960
    drop if case1960TS==1 & (PRE60wave==1 | POST60wave==1) & PRE56wave==0 & POST56wave==0 & POST58wave==0 & case1956TS==0 & case1958TS==0
    //1958
    drop if case1958TS==1 & POST58wave==1 & PRE56wave==0 & POST56wave==0 & PRE60wave==0 & POST60wave==0 & case1956TS==0 & case1960TS==0
    //1956
    drop if case1956TS==1 & (PRE56wave==1 | POST56wave==1) & POST58wave==0 & PRE60wave==0 & POST60wave==0 & case1958TS==0 & case1960TS==0 
    
    sort TS60ID
    
    * Attach the cleaned 1960 survey variables to the panel respondents
    merge 1:1  TS60ID using `missing1960'
    
    /* Keep matched rows only:
         _merge==1  ID-file rows with no 1960 interview in the cleaned data
         _merge==2  1960 respondents without a row in the (panel-only) ID file
       The _merge==2 rows carry a missing TS56ID; if they were kept, every
       later "by TS56ID" step would pool them into one artificial group. */
    keep if _merge==3
    drop _merge
    
    * Stack the 1958 and 1956 panel rows underneath the 1960 rows: the result
    * has one row per panel respondent and study year
    append using `temp58'
    append using `temp56'
    count 

    order id_anes year TS56ID PAN56ID TS58ID PAN58ID TS60ID PAN60ID 
    sort TS56ID year

**************************************************************************************************
**************************************************************************************************

/* Some panel respondents are missing at least one piece of
   information (father occupation or family income) in one year
   but have both pieces in another year. Pinpoint which year
   should be kept for each panel respondent (that meets the
   other aforementioned restrictions in the given year).
*/
    
    /*By id: count the years in which at least one of the two
             pieces of info is missing (tagmiss==1).
             TS56ID is used as the respondent key because it is
             available in all years. */
    bysort TS56ID: egen totaltagmiss = total(tagmiss), missing 
    order total*, after(tagmiss)
    tab total*, m
     
    //Count the # of observations (= study years) per id. 
    egen total = count(tagmiss), by(TS56ID) //no "." in tagmiss
    order total, after(totaltagmiss)

    tab total if totaltagmiss==1, m 
    tab total if totaltagmiss==2, m 
    
    /*Keep only ids where there's at least one year with
      missing info and at least one year without. */
    keep if (totaltagmiss==1 & total>1) | (totaltagmiss==2 & total>2) 
    
    * Per id: 1 when the respondent fails the inclusion rule of that year,
    * missing when the respondent has no row for that year
    egen target58 = min(tag1958), by(TS56ID)  
    egen target60 = min(tag1960), by(TS56ID) 
    order target*, after(tag1960)
    
    drop if (target58==1 & target60==1) | (target58==. & target60==1) | (target58==1 & target60==.) 
    /*Note: These panel respondents will be dropped in 1_ANES cleaner/2_ANES_InterGenMobility do files 
            because they do not meet age restrictions. */
  
    * Dummy: panel respondent has both pieces of info in a given year.
    * This is a row-wise expression, so no by-group prefix is required.
    gen completeinfo = (fatheroccej~=. & fam_inc_raw~=.) 
    order complete*, after(fam_inc_raw)
    
    * ageR is only filled for 1958 and 1960 rows, so this is the youngest age
    * recorded in those studies
    egen agemin = min(ageR), by(TS56ID)

    keep if inrange(agemin,26,50) //Respondents outside this age range will be dropped in 2_ANES_InterGenMobility.do
    
    *****************************************
    /* TAG FIRST YEAR THAT PANEL RESPONDENT 
       HAS ALL INFO. */
    *****************************************
    /* A row is eligible when it has complete info and is either a 1956 row
       or a 1958/1960 row that passes that year's inclusion rule.
       Among the eligible rows of a respondent the EARLIEST year is tagged.
       egen's tag() is deliberately not used here: it does not promise which
       row of a group receives the tag (it re-sorts internally and Stata
       breaks sort ties at random), so a respondent with two eligible years
       could be assigned a different year from one run to the next. */
    gen byte eligible = completeinfo==1 & (tag1958==0 | tag1960==0 | year==1956) & !missing(TS56ID)
    bysort TS56ID eligible (year): gen byte keepPR = eligible & _n==1
    drop eligible

    sort TS56ID year
    by TS56ID: egen totalt = total(keepPR==1)
    assert totalt <= 1 //at most one row tagged per id

    gen keepPR_1956 = (keepPR==1) if year==1956
    gen keepPR_1958 = (keepPR==1) if year==1958
    gen keepPR_1960 = (keepPR==1) if year==1960

    tab keepPR_1956, m
    tab keepPR_1958, m
    tab keepPR_1960, m
    
    * Retain only the tagged row of each respondent and the variables needed
    * for the output files
    keep if (keepPR_1956==1 | keepPR_1958==1 | keepPR_1960==1)
    keep id_anes keepPR_1956 keepPR_1958 keepPR_1960
    
    sort id_anes
   
************************************************
* SAVE (for merging onto ANES cleaner file)
************************************************

    // One output file per follow-up year (1958, 1960): the respondent id
    // plus that year's keep flag, sorted by id. preserve/restore leaves the
    // full set of flags in memory after each file is written.
    foreach yr in 1958 1960 {
        preserve
        keep id_anes keepPR_`yr'
        sort id_anes
        save ./1_DataSources/ANES/output/keepPRs_`yr'.dta, replace
        restore
    }
